Lab1

data 2a

# Load the data2a dataset; read.csv is read.table with header = TRUE and
# sep = "," baked in, so this is equivalent to the original call.
data <- read.csv(file="/Users/imac/Desktop/master/notes/DA/data/data2a.csv")
# Scatterplot matrix of every pair of variables to eyeball relationships.
pairs(data)

# Fit a multiple linear regression of Y on x2, x3 and x4.
# NOTE(review): the data$ form works, but lm(Y ~ x2 + x3 + x4, data = data)
# would be preferable — it keeps the formula clean and allows
# predict(model, newdata = ...).
model <- lm(data$Y~data$x2+data$x3+data$x4)
summary(model)
## 
## Call:
## lm(formula = data$Y ~ data$x2 + data$x3 + data$x4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3333 -0.6313  0.0616  0.6321  1.7456 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.13001    0.09833  -1.322    0.189    
## data$x2      0.98001    0.10522   9.314 4.44e-15 ***
## data$x3     -1.08265    0.10070 -10.751  < 2e-16 ***
## data$x4     -0.11460    0.10075  -1.137    0.258    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.973 on 96 degrees of freedom
## Multiple R-squared:  0.6716, Adjusted R-squared:  0.6614 
## F-statistic: 65.45 on 3 and 96 DF,  p-value: < 2.2e-16

Now we need to examine the residuals, because the model assumes \(y = X\beta + \epsilon\) with random errors \(\epsilon\).

#### data 2a - task 1b
# Model diagnostics: examine the residuals of the fitted model.
# Use a name other than `resid` so we do not mask stats::resid().
model_resid <- residuals(model)

# Normality check: histogram and normal Q-Q plot side by side.
par(mfrow=c(1,2))
hist(model_resid)
qqnorm(model_resid)
qqline(model_resid)

## Looks normal; the Q-Q plot shows an approximately straight line.

# Constant-variance check: residuals against fitted values.
par(mfrow=c(1,1))
plot(model$fitted.values, model_resid, pch=19)

## The scatter looks random, so constant variance seems reasonable.
## A fan (funnel) shape would instead indicate non-constant variance.

# Linearity check: residuals against each covariate in turn.
par(mfrow=c(2,2))
plot(data$x2, model_resid)
plot(data$x3, model_resid)
plot(data$x4, model_resid)
## The assumed linear relationships look fine.

#### data 2a - task 1c
# Coefficient inference: re-display the fitted model and compute 95%
# confidence intervals to judge which covariates matter.
summary(model)
## 
## Call:
## lm(formula = data$Y ~ data$x2 + data$x3 + data$x4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3333 -0.6313  0.0616  0.6321  1.7456 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.13001    0.09833  -1.322    0.189    
## data$x2      0.98001    0.10522   9.314 4.44e-15 ***
## data$x3     -1.08265    0.10070 -10.751  < 2e-16 ***
## data$x4     -0.11460    0.10075  -1.137    0.258    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.973 on 96 degrees of freedom
## Multiple R-squared:  0.6716, Adjusted R-squared:  0.6614 
## F-statistic: 65.45 on 3 and 96 DF,  p-value: < 2.2e-16
confint(model)
##                  2.5 %      97.5 %
## (Intercept) -0.3251901  0.06516203
## data$x2      0.7711463  1.18887548
## data$x3     -1.2825427 -0.88275202
## data$x4     -0.3145864  0.08538626
## You can see that x2 and x3 are significantly related to the response
## but x4 is not (its interval contains 0).


#### data2a - task 1d
# Model selection: compare the full three-covariate model against each
# two-covariate submodel, judging fit via R-squared / residual SE.
summary(lm(data$Y~data$x2+data$x3+data$x4))
## 
## Call:
## lm(formula = data$Y ~ data$x2 + data$x3 + data$x4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3333 -0.6313  0.0616  0.6321  1.7456 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.13001    0.09833  -1.322    0.189    
## data$x2      0.98001    0.10522   9.314 4.44e-15 ***
## data$x3     -1.08265    0.10070 -10.751  < 2e-16 ***
## data$x4     -0.11460    0.10075  -1.137    0.258    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.973 on 96 degrees of freedom
## Multiple R-squared:  0.6716, Adjusted R-squared:  0.6614 
## F-statistic: 65.45 on 3 and 96 DF,  p-value: < 2.2e-16
summary(lm(data$Y~data$x2+data$x3))
## 
## Call:
## lm(formula = data$Y ~ data$x2 + data$x3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.34763 -0.70301  0.03172  0.66794  1.80603 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.14495    0.09759  -1.485    0.141    
## data$x2      0.98217    0.10536   9.322 3.91e-15 ***
## data$x3     -1.06386    0.09949 -10.693  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9744 on 97 degrees of freedom
## Multiple R-squared:  0.6672, Adjusted R-squared:  0.6603 
## F-statistic: 97.23 on 2 and 97 DF,  p-value: < 2.2e-16
summary(lm(data$Y~data$x2+data$x4))
## 
## Call:
## lm(formula = data$Y ~ data$x2 + data$x4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0685 -0.8897 -0.0170  0.7036  3.9000 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.11539    0.14520  -0.795    0.429    
## data$x2      0.94407    0.15532   6.078 2.39e-08 ***
## data$x4      0.06309    0.14678   0.430    0.668    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.437 on 97 degrees of freedom
## Multiple R-squared:  0.2763, Adjusted R-squared:  0.2613 
## F-statistic: 18.51 on 2 and 97 DF,  p-value: 1.549e-07
summary(lm(data$Y~data$x3+data$x4))
## 
## Call:
## lm(formula = data$Y ~ data$x3 + data$x4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8165 -0.9257  0.0923  0.9215  2.5266 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.08897    0.13482  -0.660    0.511    
## data$x3     -1.05285    0.13815  -7.621  1.7e-11 ***
## data$x4     -0.13156    0.13826  -0.952    0.344    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.335 on 97 degrees of freedom
## Multiple R-squared:  0.3749, Adjusted R-squared:  0.362 
## F-statistic: 29.09 on 2 and 97 DF,  p-value: 1.271e-10
## You know from the previous analysis that x2 and x3 are important but not x4.
## Comparing a model with all three variables with ones with each combination of
## pairs shows the model without x4 is just as good. Therefore, as it is simpler,
## it is the model of choice.

data2b - respiratory disease risk in Scotland

# Load data2b: respiratory disease risk (SMR_resp) for Scottish areas,
# with pollution (pm10), deprivation, housing and demographic covariates.
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data2b.csv",header = TRUE,sep=',')
head(data)
##          IG  SMR_resp  Easting Northing pm10      Area income_deprived
## 1 S02000001 0.7852600 394773.5 800776.8 12.0   97.3749             2.8
## 2 S02000002 0.9751268 393809.5 801434.5 13.0  601.3972             9.4
## 3 S02000003 0.6516183 381655.6 801983.2 11.0 1975.0080             6.4
## 4 S02000004 0.4053273 389061.4 802774.9 11.5  555.7517             3.7
## 5 S02000005 1.0002806 395490.3 802979.1 12.8  675.5862             5.4
## 6 S02000006 0.7734467 393353.8 803366.8 13.5  122.8432            15.4
##   JSA_perc Price_median Sales Band_AC Band_FH ethnic_perc total_crime
## 1    0.625       105000   101  26.479  12.254        2.16         179
## 2    1.450       105123   118  73.882   8.232        1.60         462
## 3    1.225        87000   158  51.923  23.971        4.50         211
## 4    0.325       214000   151   5.465  73.795        6.23         231
## 5    1.200        78189   106  70.205   3.050        1.68         292
## 6    1.775        58000    93  86.205   3.949        1.60         682
# Scatterplot matrix of everything except column 1 (IG, the non-numeric
# area identifier).
pairs(data[ ,-1])

# Model SMR of respiratory disease on air pollution (pm10), income
# deprivation, council-tax band A-C percentage and ethnic percentage.
model <- lm(data$SMR_resp~data$pm10 + data$income_deprived + data$Band_AC + data$ethnic_perc)
summary(model)
## 
## Call:
## lm(formula = data$SMR_resp ~ data$pm10 + data$income_deprived + 
##     data$Band_AC + data$ethnic_perc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47836 -0.11427 -0.01272  0.10013  0.99427 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0658227  0.0425009   1.549   0.1217    
## data$pm10             0.0288424  0.0033814   8.530  < 2e-16 ***
## data$income_deprived  0.0188497  0.0009226  20.430  < 2e-16 ***
## data$Band_AC          0.0013626  0.0003084   4.418 1.08e-05 ***
## data$ethnic_perc     -0.0025075  0.0008134  -3.083   0.0021 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1768 on 1230 degrees of freedom
## Multiple R-squared:  0.6115, Adjusted R-squared:  0.6103 
## F-statistic: 484.1 on 4 and 1230 DF,  p-value: < 2.2e-16
## There is no single correct model; this one looks OK.

# Residual diagnostics for the data2b model.
# Use a name other than `resid` so we do not mask stats::resid().
model_resid <- residuals(model)

# Normality check: histogram and normal Q-Q plot side by side.
par(mfrow=c(1,2))
hist(model_resid)
qqnorm(model_resid)
qqline(model_resid)

## Looks normal apart from the upper tail; not too bad.

# Constant-variance check: residuals against fitted values.
par(mfrow=c(1,1))
plot(model$fitted.values, model_resid, pch=19)

## Again fairly random; not too bad.

# Residuals against every candidate covariate (columns 3-14); a loop
# replaces the twelve copy-pasted plot() calls and keeps the same order.
par(mfrow=c(4,3))
for (j in 3:14) {
  plot(data[ , j], model_resid, pch=19)
}

## Again looks fine.

# Coefficient inference for the data2b model: summary plus 95% confidence
# intervals, used to interpret the pollution effect.
summary(model)
## 
## Call:
## lm(formula = data$SMR_resp ~ data$pm10 + data$income_deprived + 
##     data$Band_AC + data$ethnic_perc)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.47836 -0.11427 -0.01272  0.10013  0.99427 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.0658227  0.0425009   1.549   0.1217    
## data$pm10             0.0288424  0.0033814   8.530  < 2e-16 ***
## data$income_deprived  0.0188497  0.0009226  20.430  < 2e-16 ***
## data$Band_AC          0.0013626  0.0003084   4.418 1.08e-05 ***
## data$ethnic_perc     -0.0025075  0.0008134  -3.083   0.0021 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1768 on 1230 degrees of freedom
## Multiple R-squared:  0.6115, Adjusted R-squared:  0.6103 
## F-statistic: 484.1 on 4 and 1230 DF,  p-value: < 2.2e-16
confint(model)
##                              2.5 %        97.5 %
## (Intercept)          -0.0175594969  0.1492049430
## data$pm10             0.0222084930  0.0354763841
## data$income_deprived  0.0170395610  0.0206597728
## data$Band_AC          0.0007575387  0.0019677066
## data$ethnic_perc     -0.0041033585 -0.0009116789
## So air pollution is significantly related to SMR of respiratory disease.
## If pm10 increases by 1 unit then SMR increases by 0.0288, i.e. about 2.9%.

Lab2

Task 1 - correlated covariates and confounding

If two covariates (x, z) have an exact linear relationship, we are effectively trying to estimate 3 coefficients from only 2 independent covariates (an intercept and z) in the model \[y_i \sim N(\beta_1+\beta_2x_i+\beta_3z_i,\sigma^2)\]

#### data3a - x and z exactly linearly related
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3a.csv",header = TRUE,sep=',')
pairs(data)

# With z an exact linear function of x the design matrix is rank
# deficient, so lm() reports a singularity and returns NA for z.
model <- lm(data$y~data$x + data$z)
summary(model)
## 
## Call:
## lm(formula = data$y ~ data$x + data$z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.26233 -0.62321 -0.02023  0.59117  2.17078 
## 
## Coefficients: (1 not defined because of singularities)
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.2313     0.2389   21.90   <2e-16 ***
## data$x        1.9788     0.0438   45.18   <2e-16 ***
## data$z            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8091 on 98 degrees of freedom
## Multiple R-squared:  0.9542, Adjusted R-squared:  0.9537 
## F-statistic:  2041 on 1 and 98 DF,  p-value: < 2.2e-16
## All three coefficients cannot be fitted: you are trying to estimate 3
## coefficients from only two independent columns of information
## (effectively an intercept and one covariate).

#### data3b - x and z nearly linearly related
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3b.csv",header = TRUE,sep=',')
pairs(data)

# With both near-collinear covariates included, the standard error on x
# is inflated (0.759 here vs 0.0997 below) and x appears non-significant.
model1 <- lm(data$y~data$x + data$z)
summary(model1)
## 
## Call:
## lm(formula = data$y ~ data$x + data$z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.43012 -0.59639  0.00043  0.66785  2.94746 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.09786    0.09309 108.474  < 2e-16 ***
## data$x       0.08249    0.75919   0.109 0.913701    
## data$z       3.01497    0.77725   3.879 0.000191 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9164 on 97 degrees of freedom
## Multiple R-squared:  0.9157, Adjusted R-squared:  0.9139 
## F-statistic: 526.7 on 2 and 97 DF,  p-value: < 2.2e-16
# Dropping one of the collinear pair gives a stable, interpretable fit.
model2 <- lm(data$y~data$x)
summary(model2)
## 
## Call:
## lm(formula = data$y ~ data$x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.36558 -0.69790 -0.01382  0.83061  2.49726 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.03442    0.09799  102.40   <2e-16 ***
## data$x       3.00510    0.09972   30.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9799 on 98 degrees of freedom
## Multiple R-squared:  0.9026, Adjusted R-squared:  0.9016 
## F-statistic: 908.1 on 1 and 98 DF,  p-value: < 2.2e-16
## Much more sensible.

#### data 3c - x and z uncorrelated
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3c.csv",header = TRUE,sep=',')
pairs(data)

# With uncorrelated covariates both coefficients can be estimated cleanly.
model1 <- lm(data$y~data$x + data$z)
summary(model1)
## 
## Call:
## lm(formula = data$y ~ data$x + data$z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.88766 -1.02881  0.07436  0.78877  3.02226 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.0182     0.1225  81.761  < 2e-16 ***
## data$x        1.0541     0.1322   7.973 3.06e-12 ***
## data$z        1.9325     0.1185  16.305  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.195 on 97 degrees of freedom
## Multiple R-squared:  0.8116, Adjusted R-squared:  0.8077 
## F-statistic: 208.9 on 2 and 97 DF,  p-value: < 2.2e-16
## The model explains about 81% of the variation in the response (R-squared).

# Omitting z: the fit is much worse (R-squared drops to 0.30).
model2 <- lm(data$y~data$x)
summary(model2)
## 
## Call:
## lm(formula = data$y ~ data$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4543 -1.5637 -0.1406  1.3773  5.9541 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.8389     0.2348  41.899  < 2e-16 ***
## data$x        1.5803     0.2467   6.405 5.23e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.299 on 98 degrees of freedom
## Multiple R-squared:  0.2951, Adjusted R-squared:  0.2879 
## F-statistic: 41.03 on 1 and 98 DF,  p-value: 5.225e-09

Task 2 - Transformations of the response

# Load data3d and fit y on x1, x2 on the raw (untransformed) scale.
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3d.csv",header = TRUE,sep=',')
head(data)
##          y         x1         x2
## 1 336.4601 -0.2413547  0.6010761
## 2 484.3973  0.3181198 -0.7707824
## 3 819.6034  1.5871393 -2.7014637
## 4 477.6218 -0.8070962 -1.7738784
## 5 296.2541 -0.8613912  0.5179591
## 6 587.1615  0.7420470 -1.3112961
pairs(data)

model <- lm(data$y~data$x1+data$x2)
summary(model)
## 
## Call:
## lm(formula = data$y ~ data$x1 + data$x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.138  -7.402  -3.350   3.631  70.300 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 407.5017     0.5803   702.2   <2e-16 ***
## data$x1      79.1261     0.5964   132.7   <2e-16 ***
## data$x2     -80.0372     0.5531  -144.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.6 on 397 degrees of freedom
## Multiple R-squared:  0.9897, Adjusted R-squared:  0.9897 
## F-statistic: 1.914e+04 on 2 and 397 DF,  p-value: < 2.2e-16
# Check normality of the residuals.
qqnorm(residuals(model))

## All coefficients are significant,
## but the Q-Q plot is not straight, so this is not a good model.

# Refit with a square-root transformed response.
Ysqrt <- sqrt(data$y)
model <- lm(Ysqrt~data$x1+data$x2)
summary(model)
## 
## Call:
## lm(formula = Ysqrt ~ data$x1 + data$x2)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.205864 -0.055455 -0.008129  0.053999  0.238598 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 19.984667   0.004549  4392.8   <2e-16 ***
## data$x1      1.996896   0.004676   427.1   <2e-16 ***
## data$x2     -2.003956   0.004336  -462.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09091 on 397 degrees of freedom
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 1.967e+05 on 2 and 397 DF,  p-value: < 2.2e-16
qqnorm(residuals(model))

## The square-root transformation works here. Note the high R-squared of
## the original model, even though it was not appropriate.
## Commonly used transformations include the natural log and square root.

Task 3 - categorical covariates and ANOVA

# Load data3e: response Y with two categorical covariates, x1 (coded as
# integers) and x2 (letters A/B/C).
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3e.csv",header = TRUE,sep=',')
head(data)
##            Y x1 x2
## 1 -2.2897502  1  A
## 2 -0.6702734  1  B
## 3  3.0457225  1  C
## 4 -2.2411011  1  A
## 5  0.8077846  1  B
## 6  6.8333579  1  C
# Wrap both covariates in factor() so lm() treats them as categorical
# (dummy-coded) rather than numeric.
model <- lm(data$Y~factor(data$x1) + factor(data$x2))

This avoids the identifiability problem where there are three parameters (\(\beta_1,\beta_2,\beta_3\)) but only two groups, e.g. (M, F), so only 2 parameters can actually be estimated.

For each factor, R drops the first level as the baseline; the coefficient \(\beta_j\) for level \(j\) tells you whether that level differs from the first one.

# Each printed coefficient compares one factor level against the baseline
# (first) level of that factor.
summary(model)
## 
## Call:
## lm(formula = data$Y ~ factor(data$x1) + factor(data$x2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4502 -1.1666 -0.0567  1.0838  4.4454 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -0.2478     0.3190  -0.777 0.438417    
## factor(data$x1)2  -0.1435     0.3722  -0.386 0.700331    
## factor(data$x1)3  -1.4319     0.4050  -3.535 0.000548 ***
## factor(data$x1)4   0.6518     0.4050   1.609 0.109769    
## factor(data$x2)B   0.1222     0.3508   0.348 0.728016    
## factor(data$x2)C   4.2253     0.3508  12.043  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.754 on 144 degrees of freedom
## Multiple R-squared:  0.5936, Adjusted R-squared:  0.5795 
## F-statistic: 42.07 on 5 and 144 DF,  p-value: < 2.2e-16

we can use an ANOVA table to check if the factor variable affect the response

# Residual normality check for the two-factor model.
qqnorm(residuals(model))

# ANOVA table: tests each factor as a whole rather than level-by-level.
anova(model)
## Analysis of Variance Table
## 
## Response: data$Y
##                  Df Sum Sq Mean Sq F value    Pr(>F)    
## factor(data$x1)   3  68.97  22.989   7.474 0.0001099 ***
## factor(data$x2)   2 578.05 289.024  93.964 < 2.2e-16 ***
## Residuals       144 442.93   3.076                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## The p-value gives the overall significance of each factor.

You can adjust the tests to account for multiple comparisons, for example with the Bonferroni correction or Tukey's honest significant difference method.

# Refit as an aov object so Tukey's honest significant difference method
# can compare every pair of levels, adjusted for multiple testing.
aov1 <- aov(model)
TukeyHSD(aov1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = model)
## 
## $`factor(data$x1)`
##            diff        lwr        upr     p adj
## 2-1 -0.01859204 -0.9856368  0.9484527 0.9999550
## 3-1 -1.37639970 -2.4291847 -0.3236147 0.0048139
## 4-1  0.70731559 -0.3454694  1.7601006 0.3037504
## 3-2 -1.35780766 -2.4588366 -0.2567788 0.0089199
## 4-2  0.72590763 -0.3751213  1.8269365 0.3203405
## 4-3  2.08371530  0.9066659  3.2607647 0.0000536
## 
## $`factor(data$x2)`
##          diff        lwr       upr     p adj
## B-A 0.1222221 -0.7084593 0.9529034 0.9352979
## C-A 4.2228248  3.3921434 5.0535061 0.0000000
## C-B 4.1006027  3.2699214 4.9312840 0.0000000
## To decide whether a factor (e.g. gender) affects the response overall,
## look at the ANOVA p-value; the adjusted p-values here show which
## specific pairs of levels differ.
  • However, you may also be interested in which pairs of levels are significantly different, not just whether there is an overall difference between some of the levels.

  • The summary(model) output gives you the significance of differences between the first level and the rest, but what about other comparisons.

  • You could address this by doing individual t-tests between each pair of levels. However, given this involves doing multiple tests, you fall victim to the multiple testing problem (see next slide but one).

  • Therefore you need to adjust the tests to allow for the fact you are doing multiple tests. One option is Tukey’s honest significant difference method to adjust for multiple comparisons.

Task 4 - Interactions

#### data3f - interaction between two continuous variables
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3f.csv",header = TRUE,sep=',')
pairs(data)

# x*z expands to x + z + x:z, i.e. both main effects plus their interaction.
model <- lm(data$y~data$x*data$z)
summary(model)
## 
## Call:
## lm(formula = data$y ~ data$x * data$z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.45817 -0.88665 -0.04824  0.80153  2.90444 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    10.0593     0.1100   91.48   <2e-16 ***
## data$x          1.9150     0.1202   15.94   <2e-16 ***
## data$z          2.8139     0.1096   25.68   <2e-16 ***
## data$x:data$z   5.0289     0.0985   51.05   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.09 on 96 degrees of freedom
## Multiple R-squared:  0.9759, Adjusted R-squared:  0.9752 
## F-statistic:  1297 on 3 and 96 DF,  p-value: < 2.2e-16
## All terms, including the interaction, are significant.

#### data3g - interaction between one continuous variable and one factor
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3g.csv",header = TRUE,sep=',')
pairs(data)

# z*factor(x) fits a separate intercept shift and a separate slope
# adjustment for z within each level of x.
model <- lm(data$y~data$z*factor(data$x))
summary(model)
## 
## Call:
## lm(formula = data$y ~ data$z * factor(data$x))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0599 -0.5829 -0.0179  0.5330  1.9672 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              6.7338     0.1834  36.710  < 2e-16 ***
## data$z                   2.9624     0.1607  18.435  < 2e-16 ***
## factor(data$x)2          4.8697     0.2517  19.345  < 2e-16 ***
## factor(data$x)3          3.4420     0.2514  13.689  < 2e-16 ***
## factor(data$x)4          8.5884     0.2620  32.778  < 2e-16 ***
## data$z:factor(data$x)2   0.7821     0.2293   3.411 0.000963 ***
## data$z:factor(data$x)3   1.9304     0.2424   7.964 4.28e-12 ***
## data$z:factor(data$x)4   3.0582     0.2210  13.837  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8573 on 92 degrees of freedom
## Multiple R-squared:  0.978,  Adjusted R-squared:  0.9763 
## F-statistic: 584.2 on 7 and 92 DF,  p-value: < 2.2e-16
## x has four levels; the first is the baseline, the other three each get
## an intercept shift and a slope adjustment.

#### data3h - two factors
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/data3h.csv",header = TRUE,sep=',')
pairs(data)

# Interaction plot of mean y by x for each level of z; roughly parallel
# profiles would suggest little interaction.
interaction.plot(data$x, data$z, data$y)

# Additive (main-effects only) model with both covariates as factors.
model <- lm(data$y~factor(data$z)+factor(data$x))
summary(model)
## 
## Call:
## lm(formula = data$y ~ factor(data$z) + factor(data$x))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.37115 -0.91130  0.07302  0.63060  2.78511 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       9.7701     0.3977  24.565  < 2e-16 ***
## factor(data$z)2   3.6655     0.3977   9.216 1.13e-12 ***
## factor(data$z)3   8.6797     0.3977  21.823  < 2e-16 ***
## factor(data$x)2   6.0359     0.4593  13.143  < 2e-16 ***
## factor(data$x)3   6.2876     0.4593  13.691  < 2e-16 ***
## factor(data$x)4  12.2975     0.4593  26.777  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.258 on 54 degrees of freedom
## Multiple R-squared:  0.9568, Adjusted R-squared:  0.9529 
## F-statistic: 239.5 on 5 and 54 DF,  p-value: < 2.2e-16
# Tukey pairwise comparisons between all level pairs of each factor.
aov1 <- aov(model)
TukeyHSD(aov1)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = model)
## 
## $`factor(data$z)`
##         diff      lwr      upr p adj
## 2-1 3.665494 2.706973 4.624016     0
## 3-1 8.679652 7.721130 9.638173     0
## 3-2 5.014158 4.055636 5.972679     0
## 
## $`factor(data$x)`
##           diff        lwr       upr     p adj
## 2-1  6.0359126  4.8184757  7.253350 0.0000000
## 3-1  6.2875708  5.0701339  7.505008 0.0000000
## 4-1 12.2975327 11.0800957 13.514970 0.0000000
## 3-2  0.2516582 -0.9657787  1.469095 0.9467044
## 4-2  6.2616201  5.0441831  7.479057 0.0000000
## 4-3  6.0099619  4.7925249  7.227399 0.0000000
## summary() only compares each level against the first; to compare other
## pairs (e.g. level 3 vs level 2) use the TukeyHSD output above.

Lab3

1. Are the mean iron percentages of the sub-samples from each quarry equal to 35%, and are the mean percentages different between the two quarries x and y?

data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/practice1.csv",header = TRUE,sep=',')
head(data)
##          x        y
## 1 34.74371 37.36711
## 2 37.65544 36.86677
## 3 38.12820 35.59161
## 4 38.60884 34.76842
## 5 37.19325 30.35593
## 6 38.64730 32.92237
t.test(data$y, mu=35)
## 
##  One Sample t-test
## 
## data:  data$y
## t = 0.80581, df = 9, p-value = 0.4411
## alternative hypothesis: true mean is not equal to 35
## 95 percent confidence interval:
##  33.86921 37.38212
## sample estimates:
## mean of x 
##  35.62567
t.test(data$x, mu=35)
## 
##  One Sample t-test
## 
## data:  data$x
## t = 4.6434, df = 9, p-value = 0.001213
## alternative hypothesis: true mean is not equal to 35
## 95 percent confidence interval:
##  36.30295 38.77849
## sample estimates:
## mean of x 
##  37.54072
# So the mean of x is different from 35 but the mean of y may not be.

t.test(data$x, data$y, paired=FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  data$x and data$y
## t = 2.0161, df = 16.17, p-value = 0.06072
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09687336  3.92697764
## sample estimates:
## mean of x mean of y 
##  37.54072  35.62567
# The two quarries' samples do not have significantly different means at the 5% level (p = 0.061).

4 Marks: 1 mark for the two one-sample t-tests against the mean of 35; 1 mark for correct conclusions; 1 mark for the unpaired two-sample t-test; 1 mark for the conclusions.

2. Two hundred observations were made on variables X and Y, and the results are stored in practice2.csv. What is the relationship between Y and X?

# Read the 200 (X, Y) observations; X turns out to take only a few distinct
# values (e.g. 90, 100, 120, 130), so it is treated as categorical below.
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/practice2.csv",header = TRUE,sep=',')
head(data)
##          Y   X
## 1 7.692856 100
## 2 8.495202 100
## 3 8.040081 100
## 4 6.216443 100
## 5 7.758054 100
## 6 6.987696 100
# This shows that X is categorical and thus neither a linear relationship nor a correlation coefficient would be appropriate. Thus a linear model with a factor relationship looks appropriate. Fitting this model gives:

# Boxplot of Y at each level of X to visualise the group differences.
boxplot(data$Y~data$X)

# One-way ANOVA model: wrap X in factor() so lm() treats it as categorical
# rather than fitting a straight line through the numeric codes.
mod <- lm(Y~factor(X), data=data)
summary(mod)
## 
## Call:
## lm(formula = Y ~ factor(X), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.68873 -0.49025 -0.05642  0.55036  2.12961 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.6837     0.1543  36.833   <2e-16 ***
## factor(X)100   1.7489     0.1890   9.254   <2e-16 ***
## factor(X)120   4.3412     0.1782  24.364   <2e-16 ***
## factor(X)130   1.8885     0.1890   9.993   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7716 on 196 degrees of freedom
## Multiple R-squared:  0.7967, Adjusted R-squared:  0.7935 
## F-statistic:   256 on 3 and 196 DF,  p-value: < 2.2e-16
# The factor is clearly significant, to see which levels are different from each other we do a Tukey’s pairwise comparison, showing only 100 and 130 as similar:

# TukeyHSD() requires an aov object, so refit the lm as an aov first.
# NOTE(review): the name `aov` shadows the base function stats::aov();
# a different variable name (e.g. `mod_aov`) would be safer.
aov <- aov(mod)
TukeyHSD(aov)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = mod)
## 
## $`factor(X)`
##              diff        lwr        upr     p adj
## 100-90   1.748896  1.2591769  2.2386147 0.0000000
## 120-90   4.341247  3.8795360  4.8029587 0.0000000
## 130-90   1.888527  1.3988079  2.3782457 0.0000000
## 120-100  2.592352  2.2273366  2.9573664 0.0000000
## 130-100  0.139631 -0.2602228  0.5394848 0.8022646
## 130-120 -2.452721 -2.8177354 -2.0877056 0.0000000

4 marks: 2 marks for the plot and saying X is a factor; 1 mark for fitting the one-way ANOVA model; 1 mark for the multiple comparison pairwise differences and the conclusions.

3. What is the relationship between mass and speed for these rodents?

# Read the rodent data: Mass (kg, presumably) and Speed for each animal.
data <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/practice3.csv",header = TRUE,sep=',')
head(data)
##   Mass Speed
## 1 9.00   3.2
## 2 4.00  16.0
## 3 0.60  36.0
## 4 0.60  20.0
## 5 0.55  27.0
## 6 0.50  18.0
# Scatterplot of speed against mass; the first two rows (Mass 9 and 4)
# are far heavier than the rest and look like outliers.
plot(data$Mass, data$Speed)

# The plot suggests two outlying points, so fitting a linear regression model (with speed as the response) with and without the outliers gives different results

# Simple linear regression of speed on mass using the full data set.
mod1 <- lm(formula = data$Speed ~ data$Mass)
summary(mod1)
## 
## Call:
## lm(formula = data$Speed ~ data$Mass)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.361 -5.702 -2.156  4.021 20.413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  16.1908     1.7615   9.192  8.3e-09 ***
## data$Mass    -1.0054     0.8504  -1.182     0.25    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.892 on 21 degrees of freedom
## Multiple R-squared:  0.0624, Adjusted R-squared:  0.01776 
## F-statistic: 1.398 on 1 and 21 DF,  p-value: 0.2503
# Slope estimate together with its 95% confidence interval
# (parm=2 selects the second coefficient, i.e. the slope on Mass).
c(mod1$coefficients[2], confint(mod1, parm=2))
##  data$Mass                       
## -1.0054370 -2.7740297  0.7631556
# Refit the regression with the first two observations (the suspected
# outliers) dropped from both response and covariate.
mod2 <- lm(formula = data$Speed[-c(1, 2)]  ~ data$Mass[-c(1, 2)] )
summary(mod2)
## 
## Call:
## lm(formula = data$Speed[-c(1, 2)] ~ data$Mass[-c(1, 2)])
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -7.568 -4.552 -1.634  2.902 19.276 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           11.984      1.957   6.125 6.89e-06 ***
## data$Mass[-c(1, 2)]   21.135      6.940   3.046  0.00665 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.617 on 19 degrees of freedom
## Multiple R-squared:  0.3281, Adjusted R-squared:  0.2927 
## F-statistic: 9.276 on 1 and 19 DF,  p-value: 0.006653
# Slope estimate and 95% confidence interval for the outlier-free model.
c(mod2$coefficients[2], confint(mod2, parm=2))
## data$Mass[-c(1, 2)]                                         
##           21.135432            6.610777           35.660087
# The first analysis shows no relationship between speed and mass, while the second shows that for every one kilogram heavier a rodent is the speed increases by 21 metres per second. Given that the 2 data points look like outliers, the second analysis looks more reasonable.

4 Marks 1 mark for the graph. 1 mark for recognizing outliers. 1 mark for fitting a model without the outliers. 1 mark for the conclusions about the increase in speed per increase in mass. This should include estimates and confidence intervals.

4. The weight of 50 people was measured in kilos (kg) before and after Christmas, and the results are stored in test1.csv in columns before and after respectively. Is the mean body weight of the group of 50 people before Christmas equal to 70 kg?

# Read the 50 before/after Christmas weights, then run a one-sample
# t-test of H0: mean(before) = 70 kg.
test1dframe <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/test1.csv",header = TRUE,sep=',')
t.test(test1dframe$before, mu=70)
## 
##  One Sample t-test
## 
## data:  test1dframe$before
## t = 9.3334, df = 49, p-value = 1.889e-12
## alternative hypothesis: true mean is not equal to 70
## 95 percent confidence interval:
##  70.93783 71.45249
## sample estimates:
## mean of x 
##  71.19516
# p-value = 1.889e-12, so the null hypothesis is rejected, therefore the mean body weight of the group of 50 people before Christmas is not equal to 70kg.

Are the mean body weights of the group before and after Christmas different?

# Paired t-test: the same 50 people were weighed before and after
# Christmas, so paired=TRUE (tests whether the mean difference is zero).
t.test(test1dframe$before, test1dframe$after, paired=TRUE)
## 
##  Paired t-test
## 
## data:  test1dframe$before and test1dframe$after
## t = -7.2859, df = 49, p-value = 2.395e-09
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.862032 -1.624557
## sample estimates:
## mean of the differences 
##               -2.243295
# p-value = 2.395e-09, so the null hypothesis is rejected, thus the mean body weights of the group before and after Christmas are not equal.

4 Marks 1 mark for the two one sample t-tests against the mean of 70 1 mark for conclusions 1 mark for the paired two sample t-test 1 mark for the conclusions

5. A colleague gives you a data set with a response variable y and a covariate x, and these data are stored in test2.csv. Explore if x and y depend on each other. Fit a simple linear regression on x and y, and report whether x appears to be a significant predictor of y according to the fitted regression. Propose a transformation of the response and fit a linear model on x and the transformed response. Does the latter model, based on the transformed response, account for more variance of the data?

# Read the (x, y) data and fit a simple linear regression of y on x.
test2dframe <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/test2.csv",header = TRUE,sep=',')
test2model1 <- lm(test2dframe$y ~ test2dframe$x)
summary(test2model1)
## 
## Call:
## lm(formula = test2dframe$y ~ test2dframe$x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.129 -15.760   0.621   8.801  44.855 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -89.7092     6.5906  -13.61   <2e-16 ***
## test2dframe$x  21.5683     0.4965   43.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.4 on 78 degrees of freedom
## Multiple R-squared:  0.9603, Adjusted R-squared:  0.9598 
## F-statistic:  1887 on 1 and 78 DF,  p-value: < 2.2e-16
# The p-value for the coefficient of x is very small, so x is a significant predictor of y.

# It appears that y depends on the square of x, therefore a square root transformation of the response y seems plausible.
# Square-root transform of the response (assumes all y >= 0 — sqrt of a
# negative value would produce NaN with a warning).
z <- sqrt(test2dframe$y)

# Refit the regression with the transformed response sqrt(y).
test2model2 <- lm(z ~ test2dframe$x)

summary(test2model2)
## 
## Call:
## lm(formula = z ~ test2dframe$x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.48969 -0.12979 -0.02777  0.10251  0.97823 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.855642   0.086312   9.913 1.85e-15 ***
## test2dframe$x 0.955510   0.006502 146.955  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.241 on 78 degrees of freedom
## Multiple R-squared:  0.9964, Adjusted R-squared:  0.9964 
## F-statistic: 2.16e+04 on 1 and 78 DF,  p-value: < 2.2e-16
# The adjusted R-squared of the initial model was already high, but the second model that is based on the transformation exhibits higher R-squared, so it seems that the latter model accounts for more variance of the data.

4 Marks 1 mark for a graph to demonstrate dependence of y on x 1 mark for fitting a simple linear regression and for concluding that x is a significant predictor of y (0.5+0.5) 1 mark for proposing the square root transform of the response (0.5 for any other reasonable proposal) 1 mark for fitting the new model using the transformed response and for justifying that the new model accounts for more variance due to increased adjusted R- squared (0.5+0.5)

6. The product y and the reactants x and z of a chemical reaction were measured in mmol per litre. Furthermore, the temperature t was recorded in Celsius. These data are available in test3.csv. Explore visually and report the relations between all involved variables. Fit a linear model to the data by including all covariates. Pick an alternative model, reasoning about your choice, and find out which of the two models should be preferred. Justify whether the assumptions for each of the fitted models seem valid.

# Read the chemical-reaction data (product y, reactants x and z,
# temperature t) and draw all pairwise scatterplots.
test3dframe <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/test3.csv",header = TRUE,sep=',')

pairs(test3dframe)

# There appears to be negative correlation between y and x as well as y and z, but no other apparent correlation is seen in the scatterplots.

# Full model: regress y on all three covariates x, z and t.
test3model1 <- lm(test3dframe$y ~ test3dframe$x+test3dframe$z+test3dframe$t)

summary(test3model1)
## 
## Call:
## lm(formula = test3dframe$y ~ test3dframe$x + test3dframe$z + 
##     test3dframe$t)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44590 -0.25089 -0.06159  0.19289  0.57854 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.995078   1.459226   0.682    0.500    
## test3dframe$x -0.991011   0.050846 -19.490  < 2e-16 ***
## test3dframe$z -1.029450   0.065644 -15.682 2.73e-16 ***
## test3dframe$t -0.001428   0.043267  -0.033    0.974    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3069 on 31 degrees of freedom
## Multiple R-squared:  0.9511, Adjusted R-squared:  0.9464 
## F-statistic: 201.1 on 3 and 31 DF,  p-value: < 2.2e-16
# Reduced model: drop the non-significant temperature covariate t.
test3model2 <- lm(test3dframe$y ~ test3dframe$x+test3dframe$z)

summary(test3model2)
## 
## Call:
## lm(formula = test3dframe$y ~ test3dframe$x + test3dframe$z)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44856 -0.25314 -0.06326  0.19182  0.57793 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.94777    0.26839   3.531  0.00128 ** 
## test3dframe$x -0.99092    0.04998 -19.827  < 2e-16 ***
## test3dframe$z -1.02957    0.06451 -15.961  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3021 on 32 degrees of freedom
## Multiple R-squared:  0.9511, Adjusted R-squared:  0.9481 
## F-statistic: 311.4 on 2 and 32 DF,  p-value: < 2.2e-16
# Normal Q-Q plot of the full model's residuals to check normality.
qqnorm(residuals(test3model1))

# x and z seem to be significant predictors of y, though this is not the case for t. The adjusted R-squared and the qq-plot show that the normality assumption is not obviously invalid; the qq-plot in particular is relatively straight, although the tails deviate from the straight line.
# Same normality check for the reduced model's residuals.
qqnorm(residuals(test3model2))

# Since t is not a significant covariate in the first model and the adjusted R-squared is more or less the same between the two models, choose the more parsimonious model with fewer covariates (x and z).

7. The circuit lap times (in seconds) of professional drivers were measured and stored in test4.csv. The professional drivers were either F1 pilots or rally drivers. 3 car brands were used, labeled as c1, c2 and c3.

  • Which factors are involved in the problem, and how many levels each of them contain?

  • Fit a linear model and identify the significant levels. Which of the two factors are significant?

  • Of all possible pairs of car brands, which ones seem to have significantly different effect on lap times?

  • Implement a new model, this time including the interaction of driver type and car brand. Is such interaction an important factor to consider?

# Read the lap-time data and fit the additive two-factor model.
test4dframe <- read.table(file="/Users/imac/Desktop/master/notes/DA/data/test4.csv",header = TRUE,sep=',')
# There are two factors, namely type of car driver and car brand. The car driver type has two levels, F1 and rally driver. The car brand has three levels, c1, c2 and c3.
test4model1 <- lm(test4dframe$times ~ factor(test4dframe$driver)+factor(test4dframe$car))

# Coefficient table: levels are compared against the baseline level
# of each factor (f1 for driver, c1 for car).
summary(test4model1)
## 
## Call:
## lm(formula = test4dframe$times ~ factor(test4dframe$driver) + 
##     factor(test4dframe$car))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8780 -0.6819  0.1190  0.7687  2.4023 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       55.7813     0.2414 231.061  < 2e-16 ***
## factor(test4dframe$driver)rally1  -0.1808     0.2477  -0.730    0.468    
## factor(test4dframe$car)c2          2.5122     0.3031   8.288  2.6e-11 ***
## factor(test4dframe$car)c3          6.1988     0.3031  20.452  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9577 on 56 degrees of freedom
## Multiple R-squared:  0.8832, Adjusted R-squared:  0.8769 
## F-statistic: 141.1 on 3 and 56 DF,  p-value: < 2.2e-16
# According to the summary, the significant levels are only the intercept, and c2, c3 levels-car brands (using c1 as the base level).

# Sequential ANOVA table: tests each factor's contribution as a whole.
anova(test4model1)
## Analysis of Variance Table
## 
## Response: test4dframe$times
##                            Df Sum Sq Mean Sq  F value Pr(>F)    
## factor(test4dframe$driver)  1   0.00   0.003   0.0027 0.9584    
## factor(test4dframe$car)     2 388.29 194.144 211.6817 <2e-16 ***
## Residuals                  56  51.36   0.917                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# According to ANOVA, the car brand appears to be an important factor, as opposed to the type of driver.

# Tukey HSD pairwise comparisons for all levels of both factors.
TukeyHSD(aov(test4model1))
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = test4model1)
## 
## $`factor(test4dframe$driver)`
##                 diff        lwr       upr     p adj
## rally1-f1 0.01295547 -0.4826653 0.5085763 0.9584246
## 
## $`factor(test4dframe$car)`
##           diff      lwr      upr p adj
## c2-c1 2.502493 1.773374 3.231612     0
## c3-c1 6.189129 5.460010 6.918248     0
## c3-c2 3.686636 2.957517 4.415754     0
# The Tukey multiple comparisons reveal all pairs of car brands, i.e. c2-c1, c3-c1 and c3-c2, as significant, whereas this is not the case for the rally vs F1 driver levels.
# Model with interaction: driver*car expands to driver + car + driver:car.
# NOTE(review): factor() is omitted here, unlike in test4model1 — this works
# if driver and car were read in as character/factor columns; confirm.
test4model2 <- lm(test4dframe$times ~ test4dframe$driver*test4dframe$car)

anova(test4model2)
## Analysis of Variance Table
## 
## Response: test4dframe$times
##                                    Df Sum Sq Mean Sq  F value Pr(>F)    
## test4dframe$driver                  1   0.00   0.003   0.0027 0.9586    
## test4dframe$car                     2 388.29 194.144 210.0925 <2e-16 ***
## test4dframe$driver:test4dframe$car  2   1.46   0.730   0.7898 0.4591    
## Residuals                          54  49.90   0.924                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ANOVA does not show that the interaction between car brand and driver type is significant.

6 Marks: 1 mark for identifying factors and their levels; 1 mark for fitting times ~ car+driver; 1 mark for summary; 1 mark for anova; 1 mark for TukeyHSD; 1 mark for times ~ car*driver and for deducing that the interaction between car brand and type of driver does not appear to be significant (0.5+0.5)